Plot with two variables

  1. Scatterplot : when explanatory variable is continuous
  2. box-and-whisker plot : when explanatory variable is categorical
  3. barplot :for categorical variable with more emphasis on effect sizes

Plotting with two continuous variables : Scatterplots

data2 <- read.table("scatter2.txt", header = TRUE)
plot(data2$xv2, data2$ys2, col = "red", xlab = "", ylab = "")
abline(lm(ys2 ~ xv2, data = data2))

# legend(locator(1), c("treatment"), col = "red", pch = 1)


# pch 
plot(0:10, 0:10, xlim = c(0, 32), ylim = c(0, 40), type = "n", xaxt = "n", yaxt = "n",
     xlab ="", ylab = "")
x <- seq(1, 31, 2)
s <- -16
f <- -1
for(y in seq(2, 40, 2.5)){
  s <- s + 16 
  f <- f + 16
  y2 <- rep(y, 16)
  points(x, y2, pch = s:f, cex = 0.7)
  text(x, y - 1, as.character(s:f), cex = 0.6)
}

color for symbols

pch 21 - 25 allows to specify the background(fill) color and the border color separately.

plot(0:9, 0:9, type = "n", xaxt = "n", yaxt = "n", ylab = "col", xlab = "bg")
axis(1, at = 1:8)
axis(2, at = 1:8)
# bg: color for background 
# col: color for border 
for(i in 1:8) points(1:8, rep(i, 8), pch = c(21, 22, 23, 24, 25), bg = 1:8, col = i)
legend("topright", c("test"), pch = 21, pt.bg = 3, col = 4) 

# pt.bg is used for specifying the fill color 

Adding text to scatterplots

map.place <- read.csv("map.places.csv")
map.data <- read.csv("bowens.csv")
map.data$nn <- ifelse(map.data$north < 60, map.data$north + 100, map.data$north)

attach(map.place)
attach(map.data)
# produce a map with places names with corresponding locations 
plot(c(20, 100), c(60, 110), type = "n", xlab = "", ylab = "", xaxt = "n", yaxt = "n")
for(i in 1:length(map.place$wanted)){
  ii <- which(map.data$place == as.character(map.place$wanted[i]))
  text(east[ii], nn[ii], as.character(place[ii]), cex = 0.6)
}

detach(map.place)
detach(map.data)

Identifying individuals in scatterplots

data <- read.table("sleep.txt", header = TRUE)
attach(data)
s <- as.numeric(factor(Subject))
plot(Days, Reaction, type = "n")
for(k in 1:max(s)){
  x <- Days[s == k]
  y <- Reaction[s == k]
  lines(x, y, type = "l", col = "gray")
}

sym <- rep(c(21, 22, 23, 24, 25), c(4, 4, 4, 3, 3))
bcol <- c(2:8, 2:8, 2:5)
for(k in 1:max(s)){
  points(Days[s == k], Reaction[s == k], pch = sym[k], bg = bcol[k], col = 4)
}

detach(data)

Using a third variable to label a scatterplot text()

data <- read.table("pgr.txt", header = TRUE)
names(data) # label with FR 
## [1] "FR"  "hay" "pH"
attach(data)
plot(hay, pH)
text(hay, pH, labels = round(FR, 2), pos = 1, offset = 0.5, cex = 0.7)

# pos = 1 : labels are centered 
# offset = 0.5 : when pos is specified, this value gives the offset of the label from the specified
# coordinate in fractions of a character width.


# use the third variable to choose the color of points 
plot(hay, pH, pch = 16, col = ifelse(FR > median(FR), 3, 4))
legend("topleft", c("FR > median", "FR < median"), col = c(3, 4), pch = 16)

detach(data)

Joining the dots

Need to first order the points on hte x axis in order to connect by lines

smooth <- read.table("smoothing.txt", header = TRUE)
attach(smooth)
## The following objects are masked _by_ .GlobalEnv:
## 
##     x, y
sequence <- order(x)
plot(x, y, pch = 16)
lines(x[sequence], y[sequence])

detach(smooth)

Plotting stepped lines

type : 1-character string giving the type of plot desired.

The following values are possible, for details, see plot:

  1. “p” for points, “l” for lines,

  2. “b” for both points and lines,

  3. “c” for empty points joined by lines,

  4. “o” for overplotted points and lines,

  5. “s” and “S” for stair steps and

  6. “h” for histogram-like vertical lines.

  7. Finally, “n” does not produce any points or lines

xx <- 0:10 
yy <- 0:10
par(mfrow = c(2, 2))
plot(xx, yy)
lines(xx, yy, col = "red")

plot(xx, yy)
lines(xx, yy, col = "blue", type = "s")

plot(xx, yy)
lines(xx, yy, col = "green", type = "S")
par(mfrow = c(1 ,1))

Adding other shapes to a plot

Special objects: * rect : rectangular

plot(0:10, 0:10, xlab = "", ylab = "", xaxt = "n", yaxt = "n", type = "n")
rect(6, 6, 9, 9)
# rect(xleft, ybottom, xright, ytop, density = NULL, angle = 45,
#     col = NA, border = NULL, lty = par("lty"), lwd = par("lwd"),
#     ...)
corners <- function(){
coos <- c(unlist(locator(1)),unlist(locator(1))) 
rect(coos[1],coos[2],coos[3],coos[4])
}

corners()

arrows(1,1,3,8)

arrows(1,9,5,9,code=3)

arrows(4,1,4,6,code=3,angle=90)


# If we want to use the cursor position for the arrow 

# click.arrows <- function(){
# coos <- c(unlist(locator(1)),unlist(locator(1))) arrows(coos[1],coos[2],coos[3],coos[4])
# }

# click.arrows()

# locations <- locator(6)

locations <- vector("list", 2)
class(locations)
## [1] "list"
names(locations) <- c("x", "y")
locations$x <- c(5, 7, 9, 8, 6, 5)
locations$y <- c(4, 4.5, 2, 1, 0.7, 2)
# polygon draws the polygons whose vertices are given in x and y.
polygon(locations,col="lavender")

Draw curverd shapes using polygon

# shade an area below a curve 
z <- seq(-3, 3, 0.1)
pd <- dnorm(z) 
par(mfrow = c(2, 2))
plot(z, pd, type = "l")
# shade it 
polygon(c(z[z <= -1], -1), c(pd[z <= -1], pd[z == -3]), col = "red")


plot(z, pd, type = "l")
# shade it 
polygon(c(z[z <= -1], -1), c(pd[z <= -1], pd[z == -3]), col = "red", border = NA)

plot(z, pd, type = "l")
# shade it 
polygon(c(z[z <= -1], -1), c(pd[z <= -1], pd[z == -3]), density = 10, col = "red") 
# density is in lines per inch 

plot(z, pd, type = "l")
# shade it 
polygon(c(z[z <= -1], -1), c(pd[z <= -1], pd[z == -3]), col = "red", lty = 3)

par(mfrow = c(1, 1))

Drawing mathematical functions

curve(x^3 - 3*x, -2, 2)

# show the effects of lowess, loess, gam and lm 
data <- read.table("jaws.txt", header = TRUE)
attach(data)
par(mfrow = c(2, 2))
plot(age, bone, pch = 16, cex = 0.6, main = "lowess")
lines(lowess(age, bone), col = "red")

plot(age, bone, pch = 16, cex = 0.6, main = "loess")
model <- loess(bone ~ age)
xv <- 0:50
yv <- predict(model, data.frame(age = xv))
lines(xv, yv, col = "red")
library(mgcv)
## Loading required package: nlme
## This is mgcv 1.8-16. For overview type 'help("mgcv-package")'.
plot(age, bone, pch = 16, cex = 0.6, main = "gam")
model <- gam(bone ~ s(age))
xv <- 0:50
yv <- predict(model, data.frame(age = xv))
lines(xv, yv, col = "red")


plot(age, bone, pch = 16, cex = 0.6, main = "polynomial")
model <- lm(bone ~ age + I(age^2) + I(age^3))
xv <- 0:50
yv <- predict(model, data.frame(age = xv))
lines(xv, yv, col = "red")

par(mfrow = c(1, 1))
# call the "s" function from package "mgcv" 
# s: Function used in definition of smooth terms within gam model formulae. The function does
# not evaluate a (spline) smooth - it exists purely to help set up a model using spline based smooths
detach(data)

Plotting with categorical explanatory variables

weather <- read.table("SilwoodWeather.txt", header = TRUE)
str(weather)
## 'data.frame':    6940 obs. of  5 variables:
##  $ upper: num  10.8 10.5 7.5 6.5 10 8 5.8 2.8 -0.8 1.5 ...
##  $ lower: num  6.5 4.5 -1 -3.3 5 3 -3.3 -5.5 -4.8 -1 ...
##  $ rain : num  12.2 1.3 0.1 1.1 3.5 0.1 0 0 0 0 ...
##  $ month: int  1 1 1 1 1 1 1 1 1 1 ...
##  $ yr   : int  1987 1987 1987 1987 1987 1987 1987 1987 1987 1987 ...
attach(weather)
month <- factor(month)
plot(month, upper, ylab = "daily maximum temp", xlab = "month")  # default plot is box plot for categorical variables 

# notches: boxes which have overlapped notches have no significant different medians under appropriate test.
boxplot(upper ~ month, notch = TRUE, ylab = "daily maximum temp", xlab = "month")

detach(weather)

Barplots with error bars

  • barplots with error bars showing the uncertainty , similar with Chapter 2.15

  • barplots showing the histogram with the help of table

trial <- read.table("compexpt.txt",header=T) 
attach(trial)
names(trial)
## [1] "biomass"  "clipping"
# function to draw error bars 
seBars <- function(x,y){
model <- lm(y ~ factor(x))
reps <- length(y)/length(levels(x)) # replicates 
sem <- summary(model)$sigma/sqrt(reps) # se 
m <- as.vector(tapply(y,x,mean))
upper <- max(m) + sem
nn <- as.character(levels(x))
xs <- barplot(m,ylim=c(0,upper),names=nn,
            ylab=deparse(substitute(y)),xlab=deparse(substitute(x)))
for (i in 1:length(xs)) {
arrows(xs[i],m[i]+sem,xs[i],m[i]-sem,angle=90,code=3,length=0.1) } }

# Add error bars 
seBars(clipping, biomass)

par(mfrow=c(1,2))
plot(clipping,biomass)
plot(clipping,biomass,notch=T)
## Warning in bxp(structure(list(stats = structure(c(415, 417, 443.5, 517, :
## some notches went outside hinges ('box'): maybe set notch=FALSE

detach(trial)

# produce barplots like "hist"
par(mfrow = c(1, 1))
pois <- rbinom(1000, 10, 0.5)
barplot(table(pois), border = NA, col = "wheat2", cex.axis = 0.6)

Plots for multiple comparisons

  • boxplots with notches,

  • Tukey’s honest significant difference : the intervals do NOT overlap the vertical dashed line are significantly different.

data <- read.table("box.txt", header = TRUE)
attach(data)
names(data)
## [1] "fact"     "response"
plot(response ~ factor(fact), notch = TRUE)

# or order the levels for better comparison 
index <- order(tapply(response, fact, median))
ordered <- factor(rep(index, rep(20, 8)))
boxplot(response~ordered,notch=T,names=as.character(index),
  xlab="ranked treatments",ylab="response")

# or use Tukey's honest significant difference 
model <- aov(response ~ factor(fact))
summary(model)
##               Df Sum Sq Mean Sq F value Pr(>F)    
## factor(fact)   7  925.7  132.24   17.48 <2e-16 ***
## Residuals    152 1150.1    7.57                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(TukeyHSD(model), las = 1) 

# Create a set of confidence intervals on the differences between the means of the levels of 
# a factor with the specified family-wise probability of coverage. The intervals are based
# on the Studentized range statistic, Tukey's ‘Honest Significant Difference’ method.
detach(data)

Using color palettes with categorical variables

data <- read.table("silwoodweather.txt", header = TRUE)
attach(data)
## The following object is masked _by_ .GlobalEnv:
## 
##     month
month <- factor(month)
season <- heat.colors(12)

temp1 <- c(rev(5:10), 5:10)
plot(month, upper, col = season[temp1], border = season[temp1], pch = 16)

detach(data)

Plots for single samples

data <- read.table("daphnia.txt", header = TRUE)
attach(data)
names(data)
## [1] "Growth.rate" "Water"       "Detergent"   "Daphnia"
# difference of historgram and barplot 
par(mfrow = c(1, 2))
hist(Growth.rate, seq(0, 8, 0.5), col = "green", main = "")

y <- as.vector(tapply(Growth.rate, list(Daphnia, Detergent), mean))
barplot(y, col = "green", ylab = "Growth rate", xlab = "Treatment")

# the effects of bins 

par(mfrow=c(2,2))
hist(Growth.rate,seq(0,8,0.25),col="green",main="")
hist(Growth.rate,seq(0,8,0.5),col="green",main="")
hist(Growth.rate,seq(0,8,2),col="green",main="")
hist(Growth.rate,c(0,3,4,8),col="green",main="")

# reproduce the fourth plot by using cut 
edges <- c(0,3,4,8)
bin <- cut(Growth.rate,edges)
bin
##  [1] (0,3] (0,3] (3,4] (0,3] (3,4] (4,8] (4,8] (3,4] (4,8] (0,3] (3,4]
## [12] (0,3] (3,4] (4,8] (4,8] (4,8] (4,8] (4,8] (0,3] (3,4] (3,4] (3,4]
## [23] (3,4] (3,4] (3,4] (4,8] (4,8] (0,3] (0,3] (3,4] (3,4] (4,8] (4,8]
## [34] (0,3] (3,4] (4,8] (0,3] (0,3] (3,4] (3,4] (3,4] (4,8] (4,8] (4,8]
## [45] (4,8] (3,4] (0,3] (3,4] (4,8] (4,8] (4,8] (3,4] (4,8] (4,8] (0,3]
## [56] (3,4] (0,3] (4,8] (4,8] (4,8] (0,3] (3,4] (4,8] (0,3] (0,3] (0,3]
## [67] (4,8] (4,8] (4,8] (0,3] (0,3] (0,3]
## Levels: (0,3] (3,4] (4,8]
is.factor(bin)
## [1] TRUE
table(bin)
## bin
## (0,3] (3,4] (4,8] 
##    21    22    29
diff(edges) # Returns suitably lagged and iterated differences
## [1] 3 1 4
(table(bin)/sum(table(bin)))/diff(edges)
## bin
##      (0,3]      (3,4]      (4,8] 
## 0.09722222 0.30555556 0.10069444
par(mfrow = c(1, 1))
barplot((table(bin)/sum(table(bin)))/diff(edges))

# These are the heights of the three bars in the density plot (bottom right, above). 
# They do not add to 1 because the bars are of different widths. It is the total
# area of the three bars that is 1 under this convention.
detach(data)

Histograms of integers

values <- rpois(1000,1.70)
par(mfrow = c(1, 2))
# before specifying bins 
hist(values,main="",xlab="random numbers from a Poisson with mean 1.7")

# after specifying bins to capture frequency of 0 
hist(values,breaks = (-0.5:8.5), main="",
            xlab="random numbers from a Poisson with mean 1.7")

par(mfrow = c(1, 1))

Overlaying histograms with smooth density functions

  • Using lines to add estimated lines

  • Using density function for continuous variables

# add lines by "lines"
y <- rnbinom(200, mu = 1.5, size = 1)
bks <-  -0.5:(max(y) + 0.5)
hist(y, bks, main="")
xs <- 0:11
ys <- dnbinom(xs, size=1.2788, mu=1.772)
lines(xs, ys*200)

# add lines by "density"
library(MASS)
attach(faithful)

# rule of thumb for bankwidth 
(max(eruptions)-min(eruptions))/(2*(1+log(length(eruptions),base=2)))
## [1] 0.192573
# the range 
range(eruptions)[2] -range(eruptions)[1]
## [1] 3.5
# the ideal number of bins 
(range(eruptions)[2] -range(eruptions)[1]) /((max(eruptions)-min(eruptions))/(2*(1+log(length(eruptions),base=2)))
)
## [1] 18.17493
par(mfrow = c(1, 2))
hist(eruptions, 15, freq = FALSE, main = "", col = 27)
lines(density(eruptions, width = 0.6, n = 200)) # specify width = 0.6 
truehist(eruptions, nbins = 15, col = 27)
lines(density(eruptions, n = 200))

par(mfrow = c(1, 1))
detach(faithful)
  • There are 18 bins even we asked for 15 bins.

  • The hists are not extactly the same even for the same data

  • The density curve is better when specifying bandwidth = 0.6

Index plot/ Trace plot

Useful for error checking

response <- read.table("das.txt", header = TRUE)
plot(response$y)

Time series plot

  • ts.plot: works for plotting objects inheriting from class = ts
# ts.plot 
data(UKLungDeaths)
# ts.plot plots several time series on a common plot. 
ts.plot(ldeaths, mdeaths, fdeaths, xlab="year", ylab="deaths", lty=c(1:3))

# produce three time series on the same axes using different line types


# plot.ts 
# works for plotting objects inheriting from class = ts 
data(sunspots)
plot(sunspots)

class(sunspots)
## [1] "ts"
is.ts(sunspots)
## [1] TRUE

Pie charts

Useful to illustrate the proportional make-up of a sample in presentations.

data <- read.csv("piedata.csv")
data
##               names amounts
## 1              coal       4
## 2               oil       2
## 3               gas       1
## 4        oil shales       3
## 5 methyl clathrates       6
# pie(x, labels = names(x), edges = 200, radius = 0.8,
#    clockwise = FALSE, init.angle = if(clockwise) 90 else 0,
#    density = NULL, angle = 45, col = NULL, border = NULL,
#    lty = NULL, main = NULL, ...)

pie(data$amounts, labels = as.character(data$names))

data$amounts
## [1] 4 2 1 3 6
sum(data$amounts)
## [1] 16
pct <- round(data$amounts / sum(data$amounts) *100, 2)
lbs <- paste(data$names, ":", pct, "%", sep = "")
pie(data$amounts, labels = lbs)

# 3D Exploded Pie Chart
library(plotrix)

pie3D(data$amounts, labels = lbs, explode = 0.1)

slices <- c(10, 12, 4, 16, 8) 
lbls <- c("US", "UK", "Australia", "Germany", "France")
pie3D(slices,labels=lbls,explode=0.1,
    main="Pie Chart of Countries ")

# Pie Chart from data frame with Appended Sample Sizes
mytable <- table(iris$Species)
lbls <- paste(names(mytable), "\n", mytable, sep="")
pie(mytable, labels = lbls, 
    main="Pie Chart of Species\n (with sample sizes)")

stripchart function

stripchart produces one dimensional scatter plots (or dot plots) of the given data. These plots are a good alternative to boxplots when sample sizes are small.

data("OrchardSprays")
with(OrchardSprays, 
     stripchart(decrease ~ treatment, 
                ylab = "decrease", xlab = "", 
                vertical = TRUE, log = "y"))

A plot to test normality

# A function that plots a data set and compares it to a plot of normally distributed
# data with the same mean and sd 

normal.plot <- function(y) {
s <- sd(y) 
plot(c(0,3),c(min(0, mean(y)-s * 4 * qnorm(0.75)),max(y)),xaxt="n",xlab="",type="n",ylab="")
#   for your data's boxes and whiskers, centred at x = 1
top <- quantile(y,0.75)
bottom <- quantile(y,0.25)
w1u <- quantile(y,0.91)
w2u <- quantile(y,0.98)
w1d <- quantile(y,0.09)
w2d <- quantile(y,0.02)
rect(0.8,bottom,1.2,top)
lines(c(0.8,1.2),c(mean(y),mean(y)),lty=3)
lines(c(0.8,1.2),c(median(y),median(y)))
lines(c(1,1),c(top,w1u))
lines(c(0.9,1.1),c(w1u,w1u))
lines(c(1,1),c(w2u,w1u),lty=3)
lines(c(0.9,1.1),c(w2u,w2u),lty=3)
nou <- length(y[y>w2u])
points(rep(1,nou),jitter(y[y>w2u]))
lines(c(1,1),c(bottom,w1d))
lines(c(0.9,1.1),c(w1d,w1d))
lines(c(1,1),c(w2d,w1d),lty=3)
lines(c(0.9,1.1),c(w2d,w2d),lty=3)
nod <- length(y[y<w2d])
points(rep(1,nod),jitter(y[y<w2d]))
#for the normal box and whiskers, centred at x = 2
n75 <- mean(y)+ s * qnorm(0.75)
n25 <- mean(y)- s * qnorm(0.75)
n91 <- mean(y)+ s * 2* qnorm(0.75)
n98 <- mean(y)+ s * 3* qnorm(0.75)
n9 <- mean(y)- s * 2* qnorm(0.75)
n2 <- mean(y)- s * 3* qnorm(0.75)
rect(1.8,n25,2.2,n75)
lines(c(1.8,2.2),c(mean(y),mean(y)),lty=3)
lines(c(2,2),c(n75,n91))
lines(c(1.9,2.1),c(n91,n91))
lines(c(2,2),c(n98,n91),lty=3)
lines(c(1.9,2.1),c(n98,n98),lty=3)
lines(c(2,2),c(n25,n9))
lines(c(1.9,2.1),c(n9,n9))
lines(c(2,2),c(n9,n2),lty=3)
lines(c(1.9,2.1),c(n2,n2),lty=3)
lines(c(1.2,1.8),c(top,n75),lty=3,col="gray")
lines(c(1.1,1.9),c(w1u,n91),lty=3,col="gray")
lines(c(1.1,1.9),c(w2u,n98),lty=3,col="gray")
lines(c(1.2,1.8),c(bottom,n25),lty=3,col="gray")
lines(c(1.1,1.9),c(w1d,n9),lty=3,col="gray")
lines(c(1.1,1.9),c(w2d,n2),lty=3,col="gray")
# label the two boxes 
axis(1,c(1,2),c("data","normal")) }


y <- rnbinom(100,1,0.2)

normal.plot(y)

Plot with multiple variables

ozonedata <- read.table("ozone.data.txt", header = TRUE)
attach(ozonedata)
names(ozonedata)
## [1] "rad"   "temp"  "wind"  "ozone"
pairs(ozonedata, panel = panel.smooth)

# coplot 
 coplot(ozone ~ wind|temp, panel = panel.smooth, overlap = -0.05)

## Tonga Trench Earthquakes
coplot(lat ~ long | depth, data = quakes)

given.depth <- co.intervals(quakes$depth, number = 4, overlap = .1)
coplot(lat ~ long | depth, data = quakes, given.v = given.depth, rows = 1)

# ordered from lowerleft to upperright with corresponding temp shown in the upper panel 
# overlap can be adjusted to control the data points in each panel 
detach(ozonedata)

# Interaction plots by "interaction.plot"
yields <- read.table("splityield.txt", header = TRUE)
attach(yields)
names(yields)
## [1] "yield"      "block"      "irrigation" "density"    "fertilizer"
interaction.plot(fertilizer, irrigation, yield)

# interaction.plot(x.factor, trace.factor, response) 
detach(yields)

Special plots

  1. use jitter to add a small noise to break ties

  2. use sunflowerplot

# design plots  with default means plot 
data <- read.table("daphnia.txt", header = TRUE)
attach(data)
plot.design(Growth.rate ~ Water*Detergent*Daphnia)

# median plot 
plot.design(Growth.rate ~ Water*Detergent*Daphnia, fun = median)

# supply anonymous function to plot 
plot.design(Growth.rate ~ Water*Detergent*Daphnia, fun = function(x) sqrt(var(x)/3))

detach(data)

# bubble plot 
# Function to plot the bubble plot centered at the points and with radius corresponding to the third variable 
# xv: x axes points 
# yv: y axes points 
# rv: a third variable related to the radius of the bubbles 

bubble.plot <- function(xv,yv,rv,bs=0.1){ 
      r <- rv/max(rv)
      yscale <- max(yv)-min(yv)
      xscale <- max(xv)-min(xv)
      plot(xv,yv,type="n", xlab=deparse(substitute(xv)), ylab=deparse(substitute(yv)))
for (i in 1:length(xv)) bubble(xv[i],yv[i],r[i],bs,xscale,yscale) }

# function to plot the circles 
bubble <- function (x, y, r, bubble.size, xscale, yscale) {
theta <- seq(0,2*pi, pi/200)
yv <- r*sin(theta)*bubble.size*yscale # a set of points  
xv <- r*cos(theta)* bubble.size*xscale
lines(x + xv,y + yv)
}

# test it 
ddd <- read.table("pgr.txt", header = TRUE)
attach(ddd)
names(ddd)
## [1] "FR"  "hay" "pH"
bubble.plot(hay, pH, FR)

detach(ddd)




# plots with identical values 

numbers <- read.table("longdata.txt", header = TRUE)
attach(numbers)
names(numbers)
## [1] "xlong" "ylong"
plot(jitter(xlong, amount = 1), jitter(ylong, amount = 1), xlab = "input", ylab = "count", pch = 16)

# jitter: Add a small amount of noise to a numeric vector.
# jitter(x, ...) returns a numeric of the same length as x, but with an amount of noise added in order to break ties.


# use sunflowerplot 
sunflowerplot(xlong, ylong)

# Multiple points are plotted as ‘sunflowers’ with multiple leaves (‘petals’) such that 
# overplotting is visualized instead of accidental and invisiblec
detach(numbers)

Chapter 6 Tables

Useful when want to convey details , while graphics useful when want to convey effects.

# use of tables 
data <- read.table("Daphnia.txt", header = TRUE)
attach(data)
names(data)
## [1] "Growth.rate" "Water"       "Detergent"   "Daphnia"
table(Water, Detergent) # Waster as row 
##       Detergent
## Water  BrandA BrandB BrandC BrandD
##   Tyne      9      9      9      9
##   Wear      9      9      9      9
table(Detergent, Water) 
##          Water
## Detergent Tyne Wear
##    BrandA    9    9
##    BrandB    9    9
##    BrandC    9    9
##    BrandD    9    9

apply family

  • apply - When you want to apply a function to the rows or columns of a matrix (and higher-dimensional analogues); not generally advisable for data frames as it will coerce to a matrix first.

  • lapply - When you want to apply a function to each element of a list in turn and get a list back.

  • sapply - When you want to apply a function to each element of a list in turn, but you want a vector back, rather than a list.

  • tapply - For when you want to apply a function to subsets of a vector and the subsets are defined by some other vector, usually a factor

Reference

# summary tables 
tapply(Growth.rate, Detergent, mean)
##   BrandA   BrandB   BrandC   BrandD 
## 3.884832 4.010044 3.954512 3.558231
# two dim summary tables 
tapply(Growth.rate, list(Daphnia, Detergent), mean)
##          BrandA   BrandB   BrandC   BrandD
## Clone1 2.732227 2.929140 3.071335 2.626797
## Clone2 3.919002 4.402931 4.772805 5.213745
## Clone3 5.003268 4.698062 4.019397 2.834151
# summary table with self-defined function 
tapply(Growth.rate, list(Daphnia, Detergent), function(x) sqrt(var(x)/length(x)))
##           BrandA    BrandB    BrandC    BrandD
## Clone1 0.2163448 0.2319320 0.3055929 0.1905771
## Clone2 0.4702855 0.3639819 0.5773096 0.5520220
## Clone3 0.2688604 0.2683660 0.5395750 0.4260212
# 3-D table 
tapply(Growth.rate, list(Daphnia, Detergent, Water), mean)
## , , Tyne
## 
##          BrandA   BrandB   BrandC   BrandD
## Clone1 2.811265 2.775903 3.287529 2.597192
## Clone2 3.307634 4.191188 3.620532 4.105651
## Clone3 4.866524 4.766258 4.534902 3.365766
## 
## , , Wear
## 
##          BrandA   BrandB   BrandC   BrandD
## Clone1 2.653189 3.082377 2.855142 2.656403
## Clone2 4.530371 4.614673 5.925078 6.321838
## Clone3 5.140011 4.629867 3.503892 2.302537
# two lists based on two levels of the third variable 

# 3-D table : ftable (flat table)
ftable(tapply(Growth.rate, list(Daphnia, Detergent, Water), mean))
##                    Tyne     Wear
##                                 
## Clone1 BrandA  2.811265 2.653189
##        BrandB  2.775903 3.082377
##        BrandC  3.287529 2.855142
##        BrandD  2.597192 2.656403
## Clone2 BrandA  3.307634 4.530371
##        BrandB  4.191188 4.614673
##        BrandC  3.620532 5.925078
##        BrandD  4.105651 6.321838
## Clone3 BrandA  4.866524 5.140011
##        BrandB  4.766258 4.629867
##        BrandC  4.534902 3.503892
##        BrandD  3.365766 2.302537
# re-arrange the order rows/cols 

water <- factor(Water, levels = c("Wear", "Tyne")) # reorder the col 
ftable(tapply(Growth.rate, list(Daphnia, Detergent, water), mean))
##                    Wear     Tyne
##                                 
## Clone1 BrandA  2.653189 2.811265
##        BrandB  3.082377 2.775903
##        BrandC  2.855142 3.287529
##        BrandD  2.656403 2.597192
## Clone2 BrandA  4.530371 3.307634
##        BrandB  4.614673 4.191188
##        BrandC  5.925078 3.620532
##        BrandD  6.321838 4.105651
## Clone3 BrandA  5.140011 4.866524
##        BrandB  4.629867 4.766258
##        BrandC  3.503892 4.534902
##        BrandD  2.302537 3.365766
# apply extra argument to tapply 
tapply(Growth.rate, Detergent, mean, trim = 0.1)
##   BrandA   BrandB   BrandC   BrandD 
## 3.874869 4.019206 3.890448 3.482322
# tapply(Growth.rate, Detergent, mean, na.rm = TRUE) # remove possible na values to avoid error


# use tapply to create new summarized dataframe 

# first convert factors to numbers 
dets <- as.vector(tapply(as.numeric(Detergent), list(Detergent,Daphnia), mean))

levels(Detergent)[dets] # levels to be used for the summarized dataframe 
##  [1] "BrandA" "BrandB" "BrandC" "BrandD" "BrandA" "BrandB" "BrandC"
##  [8] "BrandD" "BrandA" "BrandB" "BrandC" "BrandD"
# similarly 
clones<-as.vector(tapply(as.numeric(Daphnia),list(Detergent,Daphnia),mean))
levels(Daphnia)[clones]
##  [1] "Clone1" "Clone1" "Clone1" "Clone1" "Clone2" "Clone2" "Clone2"
##  [8] "Clone2" "Clone3" "Clone3" "Clone3" "Clone3"
# save the corresponding mean values 
means <- as.vector(tapply(Growth.rate, list(Detergent,Daphnia), mean))
detergent <- levels(Detergent)[dets]
daphnia <- levels(Daphnia)[clones]

# method 1: use data.frame to construct a data frame 
data.frame(means,detergent,daphnia)
##       means detergent daphnia
## 1  2.732227    BrandA  Clone1
## 2  2.929140    BrandB  Clone1
## 3  3.071335    BrandC  Clone1
## 4  2.626797    BrandD  Clone1
## 5  3.919002    BrandA  Clone2
## 6  4.402931    BrandB  Clone2
## 7  4.772805    BrandC  Clone2
## 8  5.213745    BrandD  Clone2
## 9  5.003268    BrandA  Clone3
## 10 4.698062    BrandB  Clone3
## 11 4.019397    BrandC  Clone3
## 12 2.834151    BrandD  Clone3
# method 2: use as.data.frame.table 
new <- as.data.frame.table(tapply(Growth.rate,list(Detergent,Daphnia),mean))
names(new)<-c("detergents","daphina","means") # update the names 
head(new)
##   detergents daphina    means
## 1     BrandA  Clone1 2.732227
## 2     BrandB  Clone1 2.929140
## 3     BrandC  Clone1 3.071335
## 4     BrandD  Clone1 2.626797
## 5     BrandA  Clone2 3.919002
## 6     BrandB  Clone2 4.402931
detach(data)

Expanding a table into a dataframe

count.table <- read.table("tabledata.txt", header = TRUE)
attach(count.table)
head(count.table, 3) # expand the count to the count number rows 
##   count    sex   age condition
## 1    12   male young   healthy
## 2     7   male   old   healthy
## 3     9 female young   healthy
lapply(count.table, function(x) rep(x, count.table$count)) 
## $count
##  [1] 12 12 12 12 12 12 12 12 12 12 12 12  7  7  7  7  7  7  7  9  9  9  9
## [24]  9  9  9  9  9  8  8  8  8  8  8  8  8  6  6  6  6  6  6  7  7  7  7
## [47]  7  7  7  8  8  8  8  8  8  8  8  5  5  5  5  5
## 
## $sex
##  [1] male   male   male   male   male   male   male   male   male   male  
## [11] male   male   male   male   male   male   male   male   male   female
## [21] female female female female female female female female female female
## [31] female female female female female female male   male   male   male  
## [41] male   male   male   male   male   male   male   male   male   female
## [51] female female female female female female female female female female
## [61] female female
## Levels: female male
## 
## $age
##  [1] young young young young young young young young young young young
## [12] young old   old   old   old   old   old   old   young young young
## [23] young young young young young young old   old   old   old   old  
## [34] old   old   old   young young young young young young old   old  
## [45] old   old   old   old   old   young young young young young young
## [56] young young old   old   old   old   old  
## Levels: old young
## 
## $condition
##  [1] healthy     healthy     healthy     healthy     healthy    
##  [6] healthy     healthy     healthy     healthy     healthy    
## [11] healthy     healthy     healthy     healthy     healthy    
## [16] healthy     healthy     healthy     healthy     healthy    
## [21] healthy     healthy     healthy     healthy     healthy    
## [26] healthy     healthy     healthy     healthy     healthy    
## [31] healthy     healthy     healthy     healthy     healthy    
## [36] healthy     parasitized parasitized parasitized parasitized
## [41] parasitized parasitized parasitized parasitized parasitized
## [46] parasitized parasitized parasitized parasitized parasitized
## [51] parasitized parasitized parasitized parasitized parasitized
## [56] parasitized parasitized parasitized parasitized parasitized
## [61] parasitized parasitized
## Levels: healthy parasitized
# count.table will be coerced using as.list before applying the function 

# convert the list to dataframe 
dbtable <- as.data.frame(lapply(count.table, function(x) rep(x, count.table$count)))[, -1]
# [, -1] removes the first column , which is the original count 
head(dbtable, 3)
##    sex   age condition
## 1 male young   healthy
## 2 male young   healthy
## 3 male young   healthy
detach(count.table)

Converting from a dataframe to a table

The reverse procedure of the above section.

# just use the table function 
# 
table(dbtable)
## , , condition = healthy
## 
##         age
## sex      old young
##   female   8     9
##   male     7    12
## 
## , , condition = parasitized
## 
##         age
## sex      old young
##   female   5     8
##   male     7     6
str(table(dbtable)) # a table object 
##  'table' int [1:2, 1:2, 1:2] 8 7 9 12 5 7 8 6
##  - attr(*, "dimnames")=List of 3
##   ..$ sex      : chr [1:2] "female" "male"
##   ..$ age      : chr [1:2] "old" "young"
##   ..$ condition: chr [1:2] "healthy" "parasitized"
# convert it to dataframe 
as.data.frame(table(dbtable))
##      sex   age   condition Freq
## 1 female   old     healthy    8
## 2   male   old     healthy    7
## 3 female young     healthy    9
## 4   male young     healthy   12
## 5 female   old parasitized    5
## 6   male   old parasitized    7
## 7 female young parasitized    8
## 8   male young parasitized    6

Calculating tables of proportions with prop.table

counts <- matrix(c(1:20), nrow = 5, ncol = 4)
counts
##      [,1] [,2] [,3] [,4]
## [1,]    1    6   11   16
## [2,]    2    7   12   17
## [3,]    3    8   13   18
## [4,]    4    9   14   19
## [5,]    5   10   15   20
prop.table(counts, 1) # by row 
##            [,1]      [,2]      [,3]      [,4]
## [1,] 0.02941176 0.1764706 0.3235294 0.4705882
## [2,] 0.05263158 0.1842105 0.3157895 0.4473684
## [3,] 0.07142857 0.1904762 0.3095238 0.4285714
## [4,] 0.08695652 0.1956522 0.3043478 0.4130435
## [5,] 0.10000000 0.2000000 0.3000000 0.4000000
prop.table(counts, 2)
##            [,1]  [,2]      [,3]      [,4]
## [1,] 0.06666667 0.150 0.1692308 0.1777778
## [2,] 0.13333333 0.175 0.1846154 0.1888889
## [3,] 0.20000000 0.200 0.2000000 0.2000000
## [4,] 0.26666667 0.225 0.2153846 0.2111111
## [5,] 0.33333333 0.250 0.2307692 0.2222222
prop.table(counts) # overall
##             [,1]       [,2]       [,3]       [,4]
## [1,] 0.004761905 0.02857143 0.05238095 0.07619048
## [2,] 0.009523810 0.03333333 0.05714286 0.08095238
## [3,] 0.014285714 0.03809524 0.06190476 0.08571429
## [4,] 0.019047619 0.04285714 0.06666667 0.09047619
## [5,] 0.023809524 0.04761905 0.07142857 0.09523810
sum(prop.table(counts))
## [1] 1

The scale function

scale is generic function whose default method centers and/or scales the columns of a numeric matrix.

scaled.data <- scale(counts)
scaled.data 
##            [,1]       [,2]       [,3]       [,4]
## [1,] -1.2649111 -1.2649111 -1.2649111 -1.2649111
## [2,] -0.6324555 -0.6324555 -0.6324555 -0.6324555
## [3,]  0.0000000  0.0000000  0.0000000  0.0000000
## [4,]  0.6324555  0.6324555  0.6324555  0.6324555
## [5,]  1.2649111  1.2649111  1.2649111  1.2649111
## attr(,"scaled:center")
## [1]  3  8 13 18
## attr(,"scaled:scale")
## [1] 1.581139 1.581139 1.581139 1.581139
# check the scaled effects  
apply(scaled.data, 2, mean)
## [1] 0 0 0 0
apply(scaled.data, 2, sd)
## [1] 1 1 1 1

The expand.grid function

Useful for generating tables from factorial combinations of factor levels.

expand.grid(height = seq(60, 70, 5), 
            weight = seq(1, 5, 2)) # use "=" instead of "<-" to assign the names correctly 
##   height weight
## 1     60      1
## 2     65      1
## 3     70      1
## 4     60      3
## 5     65      3
## 6     70      3
## 7     60      5
## 8     65      5
## 9     70      5

The model.matrix function

Useful for creating dummy variables

model.matrix creates a design (or model) matrix, e.g., by expanding factors to a set of dummary variables (depending on the contrasts) and expanding interactions similarly.

dd <- data.frame(a = gl(3,4), b = gl(4,1,12)) # balanced 2-way
options("contrasts")
## $contrasts
##         unordered           ordered 
## "contr.treatment"      "contr.poly"
model.matrix(~ a + b + a:b -1, dd)  # remove intercept 
##    a1 a2 a3 b2 b3 b4 a2:b2 a3:b2 a2:b3 a3:b3 a2:b4 a3:b4
## 1   1  0  0  0  0  0     0     0     0     0     0     0
## 2   1  0  0  1  0  0     0     0     0     0     0     0
## 3   1  0  0  0  1  0     0     0     0     0     0     0
## 4   1  0  0  0  0  1     0     0     0     0     0     0
## 5   0  1  0  0  0  0     0     0     0     0     0     0
## 6   0  1  0  1  0  0     1     0     0     0     0     0
## 7   0  1  0  0  1  0     0     0     1     0     0     0
## 8   0  1  0  0  0  1     0     0     0     0     1     0
## 9   0  0  1  0  0  0     0     0     0     0     0     0
## 10  0  0  1  1  0  0     0     1     0     0     0     0
## 11  0  0  1  0  1  0     0     0     0     1     0     0
## 12  0  0  1  0  0  1     0     0     0     0     0     1
## attr(,"assign")
##  [1] 1 1 1 2 2 2 3 3 3 3 3 3
## attr(,"contrasts")
## attr(,"contrasts")$a
## [1] "contr.treatment"
## 
## attr(,"contrasts")$b
## [1] "contr.treatment"
# model.matrix(~ a + b, dd, contrasts = list(a = "contr.sum"))
# model.matrix(~ a + b, dd, contrasts = list(a = "contr.sum", b = "contr.poly"))
# m.orth <- model.matrix(~a+b, dd, contrasts = list(a = "contr.helmert"))
# crossprod(m.orth) # m.orth is  ALMOST  orthogonal

Comparing table and tabulate

table is preferred.